Load key packages

# Load the `nyc_airbnb` dataset into the session; the package that provides it
# must already be attached.
data("nyc_airbnb")

Brainstorm questions

Where are max and min Airbnb prices? What are avg prices? What factors affect prices? Where are prices going up and down over time? What’s the relationship between number of reviews and avg review? Does room type affect availability? What’s the location of the unit? What areas are popular? Popularity = number of units? avg price? Are there repeat hosts? If so, what does that mean?

Prices…

# Cheapest and most expensive nightly price across all listings, ignoring
# listings with a missing price.
nyc_airbnb |>
  summarise(
    min_price = min(price, na.rm = TRUE),
    max_price = max(price, na.rm = TRUE)
  )
## # A tibble: 1 × 2
##   min_price max_price
##       <dbl>     <dbl>
## 1        10     10000
# Average nightly price over every listing with a recorded price. The summary
# column is deliberately left unnamed so it is labelled by its expression.
nyc_airbnb |>
  summarise(
    mean(price, na.rm = TRUE)
  )
## # A tibble: 1 × 1
##   `mean(price, na.rm = TRUE)`
##                         <dbl>
## 1                        145.

Distribution of prices in the 5 boroughs

# Histogram of nightly prices up to $250, one panel per borough.
#
# The listings are restricted with an explicit filter rather than with scale
# limits: limits on a continuous scale censor out-of-range data *and* drop the
# bars whose edges fall outside the limits, which silently removed the first
# and last bin of every panel. `coord_cartesian()` only zooms the view, so the
# x-axis still spans 0-250 without discarding anything.
nyc_airbnb |>
  filter(price <= 250) |>
  ggplot(aes(x = price, fill = neighbourhood_group)) +
  # $10-wide bins aligned to 0 instead of the arbitrary default of 30 bins.
  geom_histogram(binwidth = 10, boundary = 0) +
  facet_grid(. ~ neighbourhood_group) +
  coord_cartesian(xlim = c(0, 250))

does room type affect availability?

# Violin plot of yearly availability (days out of 365) for each room type.
nyc_airbnb |>
  ggplot(aes(x = room_type, y = availability_365)) +
  geom_violin()

mean price for entire home vs private room vs shared room

# Average nightly price within each room type (one output row per type).
nyc_airbnb |>
  group_by(room_type) |>
  summarise(
    mean_price = mean(price)
  )
## # A tibble: 3 × 2
##   room_type       mean_price
##   <chr>                <dbl>
## 1 Entire home/apt      207. 
## 2 Private room          87.5
## 3 Shared room           70.2

Comparing number of reviews vs. location review score

# Scatter plot of review count against the location review score.
#
# `na.rm` is an argument of the layer, not an aesthetic: placed inside `aes()`
# it has no effect. Passing it to `geom_point()` drops listings with a missing
# score without emitting the "Removed ... rows" warning.
ggplot(
  data = nyc_airbnb,
  aes(x = number_of_reviews, y = review_scores_location)
) +
  geom_point(na.rm = TRUE)

Repeat hosts

# Hosts with more than one listing, one row per host, largest portfolios first.
nyc_airbnb |>
  filter(calculated_host_listings_count > 1) |>
  distinct(host_id, host_name, calculated_host_listings_count) |>
  arrange(desc(calculated_host_listings_count))
## # A tibble: 3,944 × 3
##     host_id host_name        calculated_host_listings_count
##       <dbl> <chr>                                     <dbl>
##  1 26377263 Stat                                         35
##  2 51913826 The Bowery House                             33
##  3   417504 Dana                                         28
##  4 81634538 Rio                                          20
##  5  8874674 Laura                                        18
##  6   440022 Petter                                       18
##  7 53213930 West 42nd Street                             16
##  8 47554473 Mae                                          15
##  9 31307789 Luffy                                        14
## 10 96098402 Carrie                                       13
## # ℹ 3,934 more rows
# All listings that belong to the host(s) with the largest number of listings.
# Comparing against the column maximum avoids hard-coding the current top
# count, so the query keeps working if the data are refreshed.
nyc_airbnb |>
  filter(
    calculated_host_listings_count ==
      max(calculated_host_listings_count, na.rm = TRUE)
  )
## # A tibble: 35 × 17
##          id review_scores_location name    host_id host_name neighbourhood_group
##       <dbl>                  <dbl> <chr>     <dbl> <chr>     <chr>              
##  1 15057686                     NA Home 4…  2.64e7 Stat      Brooklyn           
##  2 15080936                     NA Home 4…  2.64e7 Stat      Brooklyn           
##  3 14776203                     10 Home 4…  2.64e7 Stat      Brooklyn           
##  4 15074005                     NA Home 4…  2.64e7 Stat      Brooklyn           
##  5  5866656                     NA Home 4…  2.64e7 Stat      Brooklyn           
##  6  5538353                     NA Home 4…  2.64e7 Stat      Brooklyn           
##  7  5632551                     10 Home 4…  2.64e7 Stat      Brooklyn           
##  8  7788565                     NA Home 4…  2.64e7 Stat      Brooklyn           
##  9  7789408                      9 Home4 …  2.64e7 Stat      Brooklyn           
## 10  7789213                     10 Home 4…  2.64e7 Stat      Brooklyn           
## # ℹ 25 more rows
## # ℹ 11 more variables: neighbourhood <chr>, lat <dbl>, long <dbl>,
## #   room_type <chr>, price <dbl>, minimum_nights <dbl>,
## #   number_of_reviews <dbl>, last_review <date>, reviews_per_month <dbl>,
## #   calculated_host_listings_count <dbl>, availability_365 <dbl>
# Total number of reviews per borough, most-reviewed borough first. Listings
# with a missing review count are ignored in the sum.
nyc_airbnb |>
  group_by(neighbourhood_group) |>
  summarize(
    total_reviews = sum(number_of_reviews, na.rm = TRUE)
  ) |>
  arrange(desc(total_reviews))
## # A tibble: 5 × 2
##   neighbourhood_group total_reviews
##   <chr>                       <dbl>
## 1 Manhattan                  323941
## 2 Brooklyn                   263542
## 3 Queens                      66611
## 4 Bronx                        9897
## 5 Staten Island                4744
# Median nightly price for every borough / room-type combination, reshaped so
# that each room type becomes its own column.
#
# `.groups = "drop"` returns an ungrouped result: it silences the regrouping
# message and keeps later steps from unknowingly operating per borough.
nyc_airbnb |>
  group_by(neighbourhood_group, room_type) |>
  summarize(median_price = median(price), .groups = "drop") |>
  pivot_wider(
    names_from = room_type,
    values_from = median_price
  )
## # A tibble: 5 × 4
##   neighbourhood_group `Entire home/apt` `Private room` `Shared room`
##   <chr>                           <dbl>          <dbl>         <dbl>
## 1 Bronx                            100              55            43
## 2 Brooklyn                         145              65            40
## 3 Manhattan                        190              90            65
## 4 Queens                           119              60            39
## 5 Staten Island                    112.             55            25
# Scatter plot of nightly price against the location review score.
ggplot(
  data = nyc_airbnb,
  mapping = aes(x = review_scores_location, y = price)
) +
  geom_point()
## Warning: Removed 10037 rows containing missing values or values outside the scale range
## (`geom_point()`).

looking at locations in Manhattan

# Point map of Manhattan listings under $1000, coloured by price.
#
# Longitude goes on the x-axis and latitude on the y-axis so the points are
# laid out like a conventional map; the reverse mapping transposes the
# borough. Assumes `lat`/`long` hold latitude/longitude as their names say.
nyc_airbnb |>
  filter(
    neighbourhood_group == "Manhattan",
    price < 1000
  ) |>
  ggplot(aes(x = long, y = lat, color = price)) +
  # Heavy transparency so dense areas remain readable.
  geom_point(alpha = .1)

descending mean price by neighborhood

# Average price of entire homes/apartments under $1000 in each Manhattan
# neighbourhood, most expensive neighbourhood first.
nyc_airbnb |>
  filter(neighbourhood_group == "Manhattan") |>
  filter(price < 1000) |>
  filter(room_type == "Entire home/apt") |>
  group_by(neighbourhood) |>
  summarise(mean_price = mean(price)) |>
  arrange(desc(mean_price))
## # A tibble: 32 × 2
##    neighbourhood      mean_price
##    <chr>                   <dbl>
##  1 Tribeca                  358.
##  2 NoHo                     312.
##  3 Flatiron District        307.
##  4 SoHo                     296.
##  5 Theater District         282.
##  6 Midtown                  276.
##  7 Battery Park City        271.
##  8 Greenwich Village        256.
##  9 Chelsea                  255.
## 10 Financial District       250.
## # ℹ 22 more rows
# Price distribution of entire homes/apartments under $1000 per Manhattan
# neighbourhood, with neighbourhoods ordered along the x-axis by price.
nyc_airbnb |>
  filter(neighbourhood_group == "Manhattan") |>
  filter(price < 1000) |>
  filter(room_type == "Entire home/apt") |>
  mutate(
    neighbourhood = fct_reorder(neighbourhood, price)
  ) |>
  ggplot(aes(x = neighbourhood, y = price)) +
  geom_violin() +
  # Tilt the neighbourhood labels so they do not overlap.
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
  )

Go back to that map

# Same Manhattan point map as before: listings under $1000, coloured by price.
#
# Longitude is mapped to x and latitude to y so the borough is drawn in its
# usual orientation rather than transposed. Assumes `lat`/`long` hold
# latitude/longitude as their names say.
nyc_airbnb |>
  filter(
    neighbourhood_group == "Manhattan",
    price < 1000
  ) |>
  ggplot(aes(x = long, y = lat, color = price)) +
  geom_point(alpha = .1)

Leaflet package loaded; interactive map created below (look at link online). `slice` gives us the first 100 rows because it’s a huge dataset. If we change from github doc to html doc, we can knit and the leaflet map will show up.

# Interactive map of the first 100 Manhattan listings priced under $1000.
#
# `addCircleMarkers()` takes longitude before latitude, so the coordinates are
# passed by name; supplying `~lat, ~long` positionally puts latitude in the
# longitude slot. Assumes `lat`/`long` hold latitude/longitude as named.
nyc_airbnb |>
  filter(
    neighbourhood_group == "Manhattan",
    price < 1000
  ) |>
  slice(1:100) |>
  leaflet() |>
  addTiles() |>
  addCircleMarkers(lng = ~long, lat = ~lat, radius = 2)
# Manhattan listings under $1000, shared by the palette and the map so the
# colour scale spans exactly the prices being drawn.
manhattan_listings <- nyc_airbnb |>
  filter(
    neighbourhood_group == "Manhattan",
    price < 1000
  )

# Red -> green -> blue palette over the plotted prices. The domain is the
# `price` column that is actually passed to `pal()` below; the dataset has no
# `ratings` column, so using it left the domain empty and raised a warning.
pal <- colorNumeric(
  c("red", "green", "blue"),
  domain = manhattan_listings$price
)

# Interactive map coloured by price. Coordinates are passed by name because
# `addCircleMarkers()` expects longitude before latitude. Assumes `lat`/`long`
# hold latitude/longitude as named.
manhattan_listings |>
  leaflet() |>
  addProviderTiles(providers$CartoDB.Positron) |>
  addCircleMarkers(lng = ~long, lat = ~lat, color = ~pal(price), radius = 2)